
/*
Clean Census covariates at both zip and county level level
*/

* ++++++++++++++++++++++
* ZIP - LEVEL
* ++++++++++++++++++++++

* covariates on demographics, racial composition, socio-economic situation and 
* structure of local economy 
* from most recent ACS, i.e. 2014-2018 5-year ACS
import delimited "${data_raw_covs}/nhgis0061_csv/nhgis0061_ds239_20185_2018_zcta.csv", clear

* give variables meaningful names
* and create totals
* population/demographics
ren ajwbe001 pop
ren ajwbe002 men
egen men_below_18 = rowtotal(ajwbe003-ajwbe006)
egen men_18_24 = rowtotal(ajwbe007-ajwbe010)
egen men_25_34 = rowtotal(ajwbe011-ajwbe012)
egen men_35_44 = rowtotal(ajwbe013-ajwbe014)
egen men_45_54 = rowtotal(ajwbe015-ajwbe016)
egen men_55_64 = rowtotal(ajwbe017-ajwbe019)
egen men_65_74 = rowtotal(ajwbe020-ajwbe022)
egen men_75_and_above = rowtotal(ajwbe023-ajwbe025)
ren ajwbe026 women
egen women_below_18 = rowtotal(ajwbe027-ajwbe030)
egen women_18_24 = rowtotal(ajwbe031-ajwbe034)
egen women_25_34 = rowtotal(ajwbe035-ajwbe036)
egen women_35_44 = rowtotal(ajwbe037-ajwbe038)
egen women_45_54 = rowtotal(ajwbe039-ajwbe040)
egen women_55_64 = rowtotal(ajwbe041-ajwbe043)
egen women_65_74 = rowtotal(ajwbe044-ajwbe046)
egen women_75_and_above = rowtotal(ajwbe047-ajwbe049)

* race
ren ajwne002 white
ren ajwne003 black
ren ajwne005 asian

* education
gen hs_ged = ajype017 + ajype018
gen some_coll = ajype019 + ajype020 + ajype021 
gen coll = ajype022 + ajype023 + ajype024 + ajype025

* income
ren ajzae001 med_hh_inc

* structure of local economy
ren aj1fe003 male_mngmt_bus_sci_arts
ren aj1fe019 male_service_occ
ren aj1fe027 male_sales_office
ren aj1fe030 male_natres_cons_maint 
ren aj1fe034 male_prod_trans

ren aj1fe039 female_mngmt_bus_sci_arts
ren aj1fe055 female_service_occ
ren aj1fe063 female_sales_office
ren aj1fe066 female_natres_cons_maint 
ren aj1fe070 female_prod_trans
 
* save key variables only
keep zcta5a pop men women white black asian med_hh_inc male_mngmt_bus_sci_arts male_service_occ male_sales_office male_natres_cons_maint male_prod_trans female_mngmt_bus_sci_arts female_service_occ female_sales_office female_natres_cons_maint female_prod_trans men_* women_* hs_ged some_coll coll

ren zcta5a zcta 

tempfile covs1
save `covs1'

* covariates on (high-speed) internet access, also from 2014-2018 ACS
import delimited "${data_raw_covs}/nhgis0062_csv/nhgis0062_ds239_20185_2018_zcta.csv", clear

* give variables meaningful names
keep zcta5a aj38e001 aj38e002 aj38e004
ren (zcta5a aj38e001 aj38e002 aj38e004) (zcta households computer broadband)
gen frac_computer = computer / households
gen frac_broadband = broadband / households

* combine with other Census covariates from above
merge 1:1 zcta using `covs1', nogen assert(3)

tempfile covs2
save `covs2'

* add zip level area, this will allow me to calculate population population density
* comes from output from QGIS (based on Tiger/Line zip shapefile)
import delimited "${data_raw_covs}/zip_area.csv", clear

keep zcta5ce10 aland10
ren (zcta5ce10 aland10) (zcta land_area)

*  combine with other Census covariates from above
merge 1:1 zcta using `covs2', nogen keep(3) assert(1 3)

* calculate population density
gen pop_density = pop / land_area 

*  clean up and save
drop land_area

save "${data_derived_covs}/zip_covariates.dta", replace
export delimited "${data_derived_covs}/zip_covariates.csv", replace


* ++++++++++++++++++++++
* COUNTY - LEVEL
* ++++++++++++++++++++++

* area of counties
* comes from shapefile of 2018 counties from Census bureau --> land area
* this file contains attribute table of that shapefile
use "${data_raw_covs}/us_counties2018_data.dta", clear

* focus on land area only because people rarely live in the water and we need this to construct
* population density variable
keep GEOID ALAND 
ren (GEOID ALAND) (county land_area)
destring county, replace

tempfile cty_land_area
save `cty_land_area'

* urban/rural 
* from 2010 Census (beacuse not included in 2018-2014 ACS)
import delimited "${data_raw_covs}/nhgis0063_csv/nhgis0063_ds172_2010_county.csv", clear

* total and urban population
ren (h7w001 h7w002) (tot_pop_2010 pop_urban_2010)
gen frac_urban_2010 = pop_urban_2010 / tot_pop_2010

* keep key variables only
keep statea countya *2010

* create county identifier
gen county = string(statea, "%02.0f") + string(countya, "%03.0f")
destring county, replace 
drop *a

tempfile urban
save `urban'

* income and population from ACS 2018-2014
import delimited "${data_raw_covs}/nhgis0063_csv/nhgis0063_ds239_20185_2018_county.csv", clear

* total population and median household income
ren (ajwme001 ajzae001) (tot_pop_2018 med_hh_inc_2018)

* keep key variables only
keep statea countya *2018

* create county identifier
gen county = string(statea, "%02.0f") + string(countya, "%03.0f")
destring county, replace 
drop *a 

* combine files
merge 1:1 county using `urban', nogen assert(1 2 3)
merge 1:1 county using `cty_land_area', nogen assert(1 2 3) keep(1 3)

* population density 
gen pop_density_2018 = tot_pop_2018 / land_area
assert pop_density_2018 != . if tot_pop_2018!=.

* clean up and save 
keep county tot_pop_2018 med_hh_inc_2018 pop_density_2018 pop_urban_2010 frac_urban_2010
order county tot_pop_2018 med_hh_inc_2018 pop_density_2018 pop_urban_2010 frac_urban_2010

save "${data_derived_covs}/county_covariates.dta", replace
export delimited "${data_derived_covs}/county_covariates.csv", replace



